
# Get complete vote-share dataset -----------------------------------------

# Read the lookup table of corrected county names for the vote data.
# It is joined onto the election results further below.
cor_names_vote_dat <- read_csv2("data/raw/harm_kreis_names/correct_vote_names.csv")

# Directory holding the raw Bundestag (federal) election results by county
data_path <- "data/raw/btw_results_county/"
# List all CSV files in the directory (one per election year).
# `pattern` is a regular expression, not a glob, so match the literal ".csv"
# extension at the end of the file name rather than using "*.csv".
files <- dir(data_path, pattern = "\\.csv$")

# Read each raw election file into a list and clean up headers/column names
btw_kreis_raw <- map(
    .x = files,
    ~ read_delim(
        file.path(data_path, .x),
        delim = ";",
        escape_double = FALSE,
        col_names = FALSE,  # Header is taken from the data below, not by readr
        trim_ws = TRUE,
        skip = 4,  # Skip the first four lines of each file
        locale = locale(encoding = "UTF-8")
    ) %>%
        janitor::row_to_names(row_number = 1) %>%  # Promote first remaining row to header
        janitor::clean_names()  # Standardize column names
)

# Name list elements by the first four-digit number found in each file name
nms <- str_extract(files, "\\d{4}")
btw_kreis_raw <- set_names(btw_kreis_raw, nm = nms)


# Clean raw election data ------------------------------------------------

# Parties that keep their own label; every other party is pooled into "other"
key_parties <- c("cdu", "csu", "fdp", "spd", "grune", "die_linke", "afd", "drp", "npd", "rep")

# Spelling variants of party names and the harmonised label each maps to
party_alias_lookup <- c(
    f_d_p = "fdp",
    af_d = "afd",
    pds = "die_linke",
    b90_gr = "grune"
)

# State codes 1-10 and their two-letter abbreviations; other codes stay as-is
land_abbr_lookup <- c(
    "1" = "SH", "2" = "HH", "3" = "NI", "4" = "HB", "5" = "NW",
    "6" = "HE", "7" = "RP", "8" = "BW", "9" = "BY", "10" = "SL"
)

# Stack all election years (list names become the `year` column) and
# harmonise the party, vote-type and state fields
btw_kreis_clean <- btw_kreis_raw %>%
    map_dfr(to_clean_elec_data, .id = "year") %>%
    # Split a trailing "_<digit>" off the party name into `type_vote`;
    # names without such a suffix get NA there
    separate(
        col = party,
        into = c("party", "type_vote"),
        sep = "_(?=\\d{1}$)",
        fill = "right"
    ) %>%
    mutate(
        # Label the vote type: rows without a suffix are "zweit" before 1990
        # and "erst" from 1990 on; rows that had a suffix are always "zweit".
        # NOTE(review): `year` is still character at this point, so the
        # comparison with 1990 is a string comparison -- fine for four-digit
        # years, confirm if other labels can occur.
        type_vote = case_when(
            year < 1990 & is.na(type_vote) ~ "zweit",
            year >= 1990 & is.na(type_vote) ~ "erst",
            TRUE ~ "zweit"
        ),
        # Replace known spelling variants; unmatched names are kept unchanged
        party = coalesce(unname(party_alias_lookup[party]), party),
        # Pool everything outside `key_parties` into "other"
        party = if_else(party %in% key_parties, party, "other"),
        # Translate state codes to abbreviations; unmatched codes are kept
        land = coalesce(unname(land_abbr_lookup[land]), land)
    )

# Reshape to wide format: one column per party / vote-type combination
# (named "<party>_<type_vote>"). The first eight columns plus every column
# starting with "statistische" identify a row.
btw_kreis_clean <- btw_kreis_clean %>%
    pivot_wider(
        id_cols = c(1:8, starts_with("statistische")),
        names_from = c(party, type_vote),
        values_from = votes,
        names_sep = "_",
        values_fill = NA,  # Combinations that do not occur stay NA
        values_fn = sum    # Add up votes when several rows map to one cell
    )

# Give the German column names English equivalents
btw_kreis_clean <- rename(
    btw_kreis_clean,
    state = land,
    name1 = kreisfreie_stadt_bzw_stadtkreis_landkreis_bzw_kreis,
    elig = wahlberechtigte,
    voters = wahler,
    invalid = ungultige,
    valid = gultige
)

# Harmonise a handful of county names and store the year as a number
btw_kreis_clean <- mutate(
    btw_kreis_clean,
    name1 = case_when(
        # "Friedberg" in BY is relabelled; the same name in other states is kept
        name1 == "Friedberg" & state == "BY" ~ "Friedberg, Stadt",
        name1 == "Konstanz, Stadt" ~ "Konstanz",
        name1 == "Lüdenscheid, Stadt" ~ "Lüdenscheid",
        # Any name containing the old spelling is replaced entirely
        str_detect(name1, "Neuß") ~ "Neuss",
        TRUE ~ name1
    ),
    year = as.numeric(year)
)

# Drop rows whose name contains "Briefwahl" or "zuordenbar"
# (special entries such as postal-voting aggregates)
btw_kreis_clean <- filter(
    btw_kreis_clean,
    !str_detect(name1, "Briefwahl|zuordenbar")
)

# Attach the corrected-name lookup (joined on all shared columns), settle on
# one name and one AGS code per row, then aggregate to county-year level
btw_kreis_clean <- btw_kreis_clean %>%
    left_join(cor_names_vote_dat) %>%
    mutate(
        # Prefer the corrected name where the lookup provides one
        name = coalesce(name2, name1),
        # Prefer the regular code; fall back to the 1990-election code
        ags = coalesce(
            statistische_kennziffer,
            statistische_kennziffer_bundestagswahl_1990
        )
    ) %>%
    select(
        -c(
            name1,
            name2,
            statistische_kennziffer,
            statistische_kennziffer_bundestagswahl_1990
        )
    ) %>%
    # Rows sharing ags, name and year are collapsed by summing every numeric
    # column; non-numeric, non-grouping columns are dropped here
    group_by(ags, name, year) %>%
    summarise(across(where(is.numeric), sum), .groups = "drop")


# Get Geodata --------------------------------------------------------------

# Read the lookup table used to correct county names in the shapefiles
cor_names_shp_dat <- read_csv2("data/raw/harm_kreis_names/correct_shapenames.csv")

# Directory with the county shapefiles over time
path <- "data/raw/elec_counties_1953_2017/"
# List the layers available in that directory
layers <- st_layers(path)

# Read every layer and reproject it to a common CRS (EPSG:5243)
list_of_files <- purrr::map(
    layers$name,
    ~ st_read(dsn = path, layer = .x) %>%
        st_transform(5243, allow_ballpark = FALSE)
)
# Name each element by the four-digit number in its layer name
names(list_of_files) <- str_extract(layers$name, "[[:digit:]]{4}")

# Combine the yearly layers and clean names & geometry
kreis_geo_dat <- list_of_files %>%
    bind_rows(.id = "year") %>%
    group_by(year) %>%
    # arrange() ignores the grouping, but rows within each year still end up
    # in descending AREA order, so the largest polygon of a name comes first
    arrange(desc(AREA)) %>%
    mutate(
        # Within a year, every repeated name after the first gets ", Stadt"
        GEN = if_else(duplicated(GEN), str_c(GEN, ", Stadt"), GEN),
        # Relabel some layer years; all other years stay unchanged
        year = case_when(
            year == "1981" ~ "1983",
            year == "1984" ~ "1987",
            year == "2001" ~ "2002",
            year == "2004" ~ "2005",
            TRUE ~ year
        ),
        # Standardize geometry type
        geometry = st_cast(geometry, "MULTIPOLYGON")
    ) %>%
    # Namespace-qualified like the other janitor calls in this script, so the
    # step does not depend on janitor being attached
    janitor::clean_names() %>%
    ungroup()

# Join election results with spatial data ----------------------------------

# Prepare the spatial data for joining: numeric year, a harmonised AGS code,
# a restricted set of districts for early years, and corrected names
kreis_geo_dat_clean <- kreis_geo_dat %>%
    mutate(
        year = as.numeric(year),
        # AGS code: `kreis_kenn` without its last three characters, falling
        # back to the first five characters of `ags` where that is missing
        ags_orig = str_sub(kreis_kenn, end = -4L),
        ags_orig = coalesce(ags_orig, str_sub(ags, end = 5L))
    ) %>%
    filter(
        # Keep only codes below 10000 in 1953 and below 11000 in the other
        # pre-1990 years; from 1990 on every row is kept. The original comment
        # says this excludes East German districts before 1990 (TODO confirm
        # the code ranges).
        # `ags_orig` is character, so convert it explicitly: comparing a
        # string with a number would otherwise be a lexicographic comparison
        # that is only correct for zero-padded five-digit codes.
        case_when(
            year == 1953 ~ as.numeric(ags_orig) < 10000,
            year < 1990 & year > 1953 ~ as.numeric(ags_orig) < 11000,
            TRUE ~ TRUE
        )
    ) %>%
    select(year, gen, ags_orig, geometry) %>%
    # Apply corrected shape names (joined on all shared columns)
    left_join(cor_names_shp_dat) %>%
    mutate(gen = if_else(is.na(gen2), gen, gen2)) %>%
    select(-gen2)


# Read in Covariates --------------------------------------------------------

# Load the pre-computed covariates from RDS
kreis_covs <- read_rds("data/raw/covariates/all_covariates.rds")

# Collect the additional covariate CSV files. The dot is escaped so that only
# names ending in the literal ".csv" extension match.
data_paths <- fs::dir_ls("data/raw/covariates/", regexp = "\\.csv$")

# Read every file with the helper read_ger_dat (defined elsewhere; assumed to
# handle the format specifics of each CSV)
dat_ls <- map(data_paths, read_ger_dat)

# Combine the individual covariate tables with successive left joins
# (each join uses all columns the two tables share)
ger_nuts3_covs <- reduce(dat_ls, left_join)

# Keep the agriculture (A) and manufacturing (B-E) sectors and express
# every "thousands_persons" column in persons
ger_nuts3_covs_cl <- ger_nuts3_covs %>%
    filter(sector %in% c("A", "B-E")) %>%
    mutate(across(contains("thousands_persons"), function(x) x * 1000))

# Replace the source column codes with descriptive names
ger_nuts3_covs_cl <- rename(
    ger_nuts3_covs_cl,
    nuts_3 = territory_id,
    gdp_capita_pps = shvgdp_pps,
    tot_emp = snetd_thousands_persons,
    emp_by_nace = snetz_thousands_persons,
    tot_pop = snptn_persons,
    gdp_growth_rt_perc = spvge_percent,
    gva_tot_mil_eur = suvge_million_eur,
    gva_by_nace_mil_eur = suvgz_million_eur
)

# Spread the sector-specific measures into one column per sector
# (suffixes "_A" and "_B-E")
ger_nuts3_covs_cl <- pivot_wider(
    ger_nuts3_covs_cl,
    names_from = sector,
    values_from = c(emp_by_nace, gva_by_nace_mil_eur)
)

# Derived indicators: employment share, logs, and sector shares in percent
ger_nuts3_covs_cl <- mutate(
    ger_nuts3_covs_cl,
    emp_sh = tot_emp / tot_pop,
    log_pop = log(tot_pop),
    log_gdp_capita_pps = log(gdp_capita_pps),
    log_gva = log(gva_tot_mil_eur),
    agri_emp_sh = (emp_by_nace_A / tot_emp) * 100,
    manu_emp_sh = (`emp_by_nace_B-E` / tot_emp) * 100,
    agri_gva_sh = (gva_by_nace_mil_eur_A / gva_tot_mil_eur) * 100,
    manu_gva_sh = (`gva_by_nace_mil_eur_B-E` / gva_tot_mil_eur) * 100
)

# Load concordance between NUTS3 and AGS region codes ----------------------

# Concordance table, read from the second sheet of the workbook
conc_nuts_ags <- readxl::read_excel("data/raw/04-kreise.xlsx", sheet = 2)

# Attach the concordance to the NUTS3 covariates (joined on all shared
# columns) and adjust year labels
ger_kreis_covs_cl <- ger_nuts3_covs_cl %>%
    left_join(conc_nuts_ags) %>%
    mutate(
        # State identifier: first two characters of the AGS code
        state_id = str_sub(ags, end = 2L),
        # Rows with a state id above 10 and year 1991 are relabelled 1990;
        # every other row keeps its year
        year = case_when(
            year == 1991 & as.numeric(state_id) > 10 ~ 1990,
            TRUE ~ year
        )
    )

# Write the county-level covariate dataset to disk
write_rds(ger_kreis_covs_cl, "data/raw/covariates/kreis_covs.rds")

